import os
import sys
import gzip
from collections import defaultdict
from Bio import SeqIO

# The condition name (first command-line argument) selects the input and
# output file names used throughout this script.
condition = sys.argv[1]

# Build a mapping from each sequence found in "<condition>.bam.fa" to the list
# of record IDs carrying that sequence.  A list is used because the same
# sequence may appear under more than one ID.
filename = "%s.bam.fa" % condition
print("Reading", filename)
names = defaultdict(list)
# The context manager guarantees the file is closed even if parsing fails.
with open(filename) as stream:
    records = SeqIO.parse(stream, "fasta")
    for record in records:
        names[str(record.seq)].append(record.id)

def generate():
    """Yield the records of the gzipped FASTA file with their IDs replaced.

    Reads "<condition>.fa.gz" from the fixed data directory.  For every
    record, an ID is popped from the module-level ``names`` mapping
    (sequence -> list of IDs) under the record's own sequence and assigned
    to ``record.id``; ``record.name`` and ``record.description`` are cleared.
    ``names`` is consumed in the process: a sequence's entry is deleted once
    its last ID has been used.

    Yields:
        Each parsed record, in file order, with its ID replaced.

    Raises:
        IndexError: if a record's sequence has no unused ID left in
            ``names``.
        AssertionError: if ``names`` still holds unused IDs after the whole
            file has been read.
    """
    directory = "/osc-fs_home/mdehoon/Data/CASPARs/CAGE/Fasta"
    filename = "%s.fa.gz" % condition
    path = os.path.join(directory, filename)
    print("Reading", path)
    # The context manager closes the stream even when the consumer stops
    # iterating early or an exception propagates out of the generator.
    with gzip.open(path, "rt") as stream:
        for record in SeqIO.parse(stream, "fasta"):
            sequence = str(record.seq)
            # .get() avoids the defaultdict inserting an empty list for a
            # sequence that was never seen, which would pollute ``names``.
            candidates = names.get(sequence)
            if not candidates:
                raise IndexError(
                    "no unused name available for the sequence of record %r in %s"
                    % (record.id, path)
                )
            # Take the most recently added ID for this sequence.
            name = candidates.pop()
            if not candidates:
                del names[sequence]
            record.id = name
            record.name = ""
            record.description = ""
            yield record
    # Raised explicitly (rather than via ``assert``) so the consistency check
    # is still performed when Python runs with -O.
    if names:
        raise AssertionError(
            "%d sequence(s) still have unused names after reading %s"
            % (len(names), path)
        )
       
# Write the records produced by generate() to "<condition>.fa" in FASTA
# format.  The context manager flushes and closes the output file even if
# generate() or the writer raises part-way through.
filename = "%s.fa" % condition
print("Writing", filename)
with open(filename, "wt") as stream:
    records = generate()
    SeqIO.write(records, stream, "fasta")
